all_meta <- read_csv(file = here::here('data/drawings/stringent_cleaned_dataset_meta/all_object_metadata_cleaned.csv')) %>%
as_tibble() %>%
filter(age_numeric>2) %>%
mutate(category = str_split_fixed(category,' ',2)[,2]) %>%
mutate(category = str_replace(category,' ','.')) # ice cream
## Parsed with column specification:
## cols(
## session_id = col_character(),
## category = col_character(),
## age = col_character(),
## num_strokes = col_double(),
## draw_duration_old = col_double(),
## draw_duration_new = col_double(),
## mean_intensity = col_double(),
## age_numeric = col_double(),
## filename = col_character()
## )
num_subs = length(unique(all_meta$session_id))
The final, filtered dataset of N=34119 drawings from 48 categories from 6853 children who were on average 5.6870365years of age (range 3-10 years).
frequency = read_csv(file = here::here('data/surveys/drawing_experience/preprocessed/Category_frequency_survey.csv')) %>%
filter(childs_age > 2) # a 2-year-old's parent got in there somehow..
## Parsed with column specification:
## cols(
## childs_age = col_double(),
## subject_id = col_character(),
## category = col_character(),
## often_drawn_rating = col_double()
## )
To assess this, 50 parents of children aged 3-10 years filled out a survey asking about the frequency with with their children drew the categories in the dataset.
count_by_age <- frequency %>%
filter(childs_age > 2) %>%
group_by(childs_age) %>%
dplyr::summarize(num_surveys = length(unique(subject_id)))
## `summarise()` ungrouping output (override with `.groups` argument)
freq_by_category <- frequency %>%
filter(childs_age > 2) %>%
mutate(category = str_split_fixed(category,' ',2)[,2]) %>%
mutate(category = str_replace(category,' ','.')) %>%
filter(category %in% all_meta$category) %>%
group_by(category) %>%
summarize(drawing_frequency = mean(often_drawn_rating)) %>%
mutate(above_median_freq = drawing_frequency > median(drawing_frequency))
## `summarise()` ungrouping output (override with `.groups` argument)
# write_csv(freq_by_category, here::here('data/surveys/drawing_experience/preprocessed/freq_by_category.csv'))
all_tracings <- read_csv(here('data/tracing/rated_all_museumstation_filtered.csv'))%>%
select(-X1, -X)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X = col_double(),
## session_id = col_character(),
## age = col_double(),
## category = col_character(),
## pre_tran = col_double(),
## post_tran = col_double(),
## rotate = col_double(),
## translate = col_double(),
## scale = col_double(),
## has_ref = col_logical(),
## rating = col_double()
## )
## Make averages for joining
by_subject_tracing_avg <- all_tracings %>%
distinct(session_id, category, age, rating) %>%
group_by(session_id) %>%
summarize(avg_tracing_rating = mean(rating))
## `summarise()` ungrouping output (override with `.groups` argument)
animacy_csv <- read_csv(here::here('data/drawings/category_metadata/animacy.csv')) %>%
as_tibble() %>%
mutate(animacy_size = case_when(animacy == '0' & size=='0' ~ 2,
animacy == '0' & size=='1' ~ 1,
animacy == '1' & size=='0' ~ 3,
animacy == '1' & size=='1' ~ 4))
## Parsed with column specification:
## cols(
## animacy = col_double(),
## size = col_double(),
## category = col_character()
## )
num_batches=232
reg_string = 'C_0.1_T_0.1'
classification_data <- read.csv(here::here('data','compiled_classifications/',paste0(reg_string, 'batchtotal_',as.character(num_batches),'.csv'))) %>%
mutate(session_id = paste('cdm_',session_id,sep="")) %>%
mutate(age_numeric = age) %>%
mutate(age = paste('age',age,sep="")) %>%
mutate(age = as.factor(age)) %>%
mutate(category = target_label) %>%
mutate(image_name = paste(target_label,'_sketch_', age,'_', session_id,'.png',sep="")) %>%
select(-X) %>%
filter(age_numeric >2) %>%
mutate(category = str_replace(category,' ','.')) # ice cream = ice.crea
d <- classification_data %>%
mutate(correct_or_not = as.logical(correct_or_not)) %>%
gather(key = 'class', value = 'prob', contains('prob')) %>%
mutate(class = str_split_fixed(class, '_prob',2)[,1]) %>%
group_by(image_name, age, category, correct_or_not, session_id, age_numeric) %>%
summarize(denom = sum(prob), target_label_prob = prob[class==category], log_odds = log(target_label_prob / (denom - target_label_prob))) %>%
rename(filename = image_name) %>%
left_join(all_meta, by=c("filename", "category", "age_numeric","session_id")) %>%
mutate(draw_duration = draw_duration_old) %>%
mutate(run = substr(session_id,0,10)) %>%
left_join(freq_by_category)
## `summarise()` regrouping output by 'image_name', 'age', 'category', 'correct_or_not', 'session_id' (override with `.groups` argument)
## Joining, by = "category"
d <- d %>%
left_join(by_subject_tracing_avg)
## Joining, by = "session_id"
# weird things were happening with category matching, check
assert_that(length(unique(d$filename)) == length(unique(classification_data$image_name)))
## [1] TRUE
# every drawing should have all of these, regardless
assert_that(sum(is.na(d$age_numeric))==0)
## [1] TRUE
assert_that(sum(is.na(d$category))==0)
## [1] TRUE
assert_that(sum(is.na(d$correct_or_not))==0)
## [1] TRUE
missing_meta <- d %>%
filter(is.na(num_strokes))
assert_that(length(missing_meta$filename)==0)
## [1] TRUE
### How do our covariates change with age? Compute means and CIs; Group by age/category
## first summarize data
cor_by_age <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_cor")
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
# cor_by_age_by_session <- d %>%
# group_by(session_id, age_numeric) %>%
# summarize(avg_cor = mean(correct_or_not)) %>%
# group_by(age_numeric) %>%
# multi_boot_standard(col = "avg_cor")
draw_duration <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_draw_duration = mean(draw_duration)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_draw_duration")
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
num_strokes <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_num_strokes = mean(num_strokes)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_num_strokes")
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
avg_intensity <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_intensity = mean(mean_intensity)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_intensity")
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
tracing_scores <- d %>%
distinct(session_id,age_numeric,avg_tracing_rating) %>%
filter(!is.na(avg_tracing_rating)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_tracing_rating")
## Make compiled plot of descriptives
base_size_chosen=12 # size of text in plots
smooth_alpha=.2
cor_by_age_plot_A = ggplot(cor_by_age, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Classification accuracy') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
geom_smooth(col='grey', alpha=smooth_alpha) +
ylim(0,.75) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey") +
ggtitle('A')
p1=ggplot(draw_duration, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Drawing duration (s)') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
ylim(0,15) +
geom_smooth(col='grey', span = 10) +
ggtitle('B')
p2=ggplot(avg_intensity, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Ink used (mean intensity)') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
ylim(.02,.06) +
geom_smooth(col='grey', span = 10,alpha=smooth_alpha) +
ggtitle('C')
p3=ggplot(num_strokes, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Number of strokes') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
ylim(0,20) +
geom_smooth(col='grey', span = 10,alpha=smooth_alpha) +
ggtitle('D')
p4=ggplot(tracing_scores, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Estimated tracing score') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
ylim(1,5) +
geom_smooth(col='grey', span = 10,alpha=smooth_alpha) +
ggtitle('E')
ggarrange(cor_by_age_plot_A,p1,p2,p3,p4, nrow=2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## adding dummy grobs
ggsave('figures/mainResults.pdf',width=7.5, height=3, units='in')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
num_bins = 3
cor_by_age_by_strokes <- d %>%
ungroup() %>%
mutate(bin = ntile(num_strokes, num_bins)) %>%
group_by(bin, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric,bin) %>%
multi_boot_standard(col = "avg_cor") %>%
mutate(covariate = 'by strokes drawn')
## `summarise()` regrouping output by 'bin', 'age_numeric' (override with `.groups` argument)
cor_by_age_by_time <- d %>%
ungroup() %>%
mutate(bin = ntile(draw_duration, num_bins)) %>%
group_by(bin, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric,bin) %>%
multi_boot_standard(col = "avg_cor") %>%
mutate(covariate = 'by time spent')
## `summarise()` regrouping output by 'bin', 'age_numeric' (override with `.groups` argument)
cor_by_age_by_intensity <- d %>%
ungroup() %>%
mutate(bin = ntile(mean_intensity, num_bins)) %>%
group_by(bin, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric,bin) %>%
multi_boot_standard(col = "avg_cor") %>%
mutate(covariate = 'by ink used')
## `summarise()` regrouping output by 'bin', 'age_numeric' (override with `.groups` argument)
cor_by_age_by_tracing <- d %>%
ungroup() %>%
mutate(bin = ntile(avg_tracing_rating, num_bins)) %>%
group_by(bin, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric,bin) %>%
multi_boot_standard(col = "avg_cor") %>%
mutate(covariate = 'by tracing ability')
## `summarise()` regrouping output by 'bin', 'age_numeric' (override with `.groups` argument)
merged <- cor_by_age_by_intensity %>%
full_join(cor_by_age_by_strokes) %>%
full_join(cor_by_age_by_time) %>%
full_join(cor_by_age_by_tracing) %>%
filter(age_numeric > 2) %>%
mutate(bin_name = as.factor(bin))
## Joining, by = c("age_numeric", "bin", "ci_lower", "ci_upper", "mean", "covariate")
## Joining, by = c("age_numeric", "bin", "ci_lower", "ci_upper", "mean", "covariate")
## Joining, by = c("age_numeric", "bin", "ci_lower", "ci_upper", "mean", "covariate")
# mutate(bin_name = case_when(bin == 1 ~ "Low",
# bin == 2 ~ "Medium",
# bin == 3 ~ "High"))
base_size_chosen=10
ggplot(merged, aes(age_numeric,mean, color=bin_name, group=bin_name, col=bin_name)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), alpha=.6, size=.25) +
theme_few(base_size = 10) +
labs(x='Age (in years)', y='Classification accuracy') +
scale_color_viridis(option="C", begin=.2, end=.8, discrete=TRUE, name = 'Effort') +
scale_x_continuous(breaks=seq(2,10,1)) +
geom_smooth(span=10, alpha=smooth_alpha, size=.4) +
ylim(0,.75) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey") +
theme(legend.position ='right',aspect.ratio = 1) +
# theme(aspect.ratio = 1, legend.position = c(.08, .75),legend.text = element_text(size=6),legend.title = element_text(size=8),legend.background = element_rect(fill=alpha('white', 0))) +
facet_grid(~covariate)
## Warning: Duplicated aesthetics after name standardisation: colour
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave('figures/visuomotor_control.pdf', height=3, units='in')
## Saving 7 x 3 in image
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor_by_session <- d %>%
group_by(age_numeric,session_id) %>%
filter(age_numeric >2) %>%
dplyr::summarize(mean = mean(correct_or_not), num_drawings = n()) %>%
group_by(age_numeric)
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
base_size_chosen=10
ggplot(cor_by_age %>% filter(age_numeric > 2), aes(age_numeric,mean, col=age_numeric)) +
geom_jitter(data=cor_by_session, width=.1, height=.1, alpha=.01) +
geom_pointrange(aes(y=mean, ymin = ci_lower, ymax = ci_upper)) +
geom_smooth(alpha=smooth_alpha, color='grey') +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Classification accuracy') +
scale_x_continuous(breaks = seq(3,10,1)) +
theme(legend.position = "none", aspect.ratio = 1) +
scale_color_viridis(option="D", breaks=seq(3,10,1)) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave('figures/cor_by_session.pdf', width=3, height=3, units='in')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# ggplot(cor_by_session %>% filter(age_numeric > 2), aes(x=as.factor(age_numeric),y=mean, col=age_numeric, size=num_drawings)) +
# geom_flat_violin() +
# geom_point(position = position_jitter(width=.15, height = .01), size = .25)+
# theme_cowplot()+
# guides(fill = FALSE, colour = FALSE)
#
#
# # geom_jitter(data=cor_by_session, width=.1, height=.1, alpha=.01, aes(size=num_drawings)) +
# # geom_pointrange(aes(y=mean, ymin = ci_lower, ymax = ci_upper)) +
# geom_smooth(alpha=smooth_alpha, color='grey') +
# theme_few(base_size = base_size_chosen) +
# labs(x='Age', y='Classification accuracy') +
# scale_color_viridis(option="D") +
# theme(legend.position = "none", aspect.ratio = 1) +
# ylim(0,.75) +
# geom_hline(yintercept = 1/48, linetype="dashed", color="grey")
cor_by_category_by_age <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_log_odds = mean(log_odds), num_drawings = n()) %>%
left_join(freq_by_category) %>%
mutate(category = fct_reorder(category, drawing_frequency, .desc=TRUE))
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
## Joining, by = "category"
ggplot(cor_by_category_by_age, aes(age_numeric,avg_log_odds, color=drawing_frequency, size=num_drawings)) +
geom_point(alpha=.5) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Log odds') +
scale_color_viridis(option="A") +
theme(legend.position = "none") +
geom_smooth(span=10, alpha=smooth_alpha) +
# ylim(0,1) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey") +
facet_wrap(~category, nrow=6)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor_by_category_by_age <- d %>%
filter(age_numeric > 2) %>%
group_by(session_id, age_numeric,category) %>%
dplyr::summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric, category) %>%
multi_boot_standard(col = 'avg_cor') %>%
ungroup () %>%
left_join(freq_by_category) %>%
mutate(category = fct_reorder(category, drawing_frequency))
## `summarise()` regrouping output by 'session_id', 'age_numeric' (override with `.groups` argument)
## Joining, by = "category"
ggplot(cor_by_category_by_age, aes(age_numeric,mean, color=drawing_frequency)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), position=position_dodge(width=.2), alpha=.8) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Proportion correct') +
scale_color_viridis(option="A", begin=.4, end=.8) +
theme(legend.position = "none") +
geom_smooth(span=10, alpha=smooth_alpha) +
# ylim(0,1) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey") +
facet_wrap(~category, nrow=6)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# first summarize data
cor_by_age_low_freq <- d %>%
group_by(above_median_freq, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric, above_median_freq) %>%
multi_boot_standard(col = "avg_cor")
## `summarise()` regrouping output by 'above_median_freq', 'age_numeric' (override with `.groups` argument)
ggplot(cor_by_age_low_freq, aes(age_numeric,mean, color=above_median_freq)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Classification accuracy') +
# scale_color_viridis(option="D") +
theme(legend.position = "none") +
geom_smooth(col='grey',span=10, alpha=smooth_alpha) +
ylim(0,.75) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
lo_correct_category_by_age <- d %>%
filter(age_numeric > 2) %>%
filter(correct_or_not==1) %>%
mutate(category = str_replace(category,'ice.cream','ice cream')) %>% # ice.cream -> ice_cream
mutate(age = cut(age_numeric, c(2.9, 5, 7,10.1), labels = c("3-4","5-6","7-10"))) %>%
group_by(session_id, age,category) %>%
summarize(avg_cor = mean(log_odds), num_drawings = n()) %>%
group_by(age, category) %>%
multi_boot_standard(col = 'avg_cor') %>%
ungroup () %>%
mutate(category = fct_reorder(category, mean))
## `summarise()` regrouping output by 'session_id', 'age' (override with `.groups` argument)
base_size_chosen=12
ggplot(lo_correct_category_by_age, aes(x = category, y = mean, col = age)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), alpha=.6, size=.2) +
coord_flip() +
theme_few(base_size = base_size_chosen) +
labs(y = "Classifier evidence (log odds)", x = "Object category") +
scale_color_viridis(discrete=TRUE, begin=0, end=.8, name = "Age group") +
theme(legend.position = c(.8,.25),legend.text = element_text(size=6), legend.background = element_rect(fill=alpha('white', 0)), aspect.ratio=1.61)
## Warning: Removed 1 rows containing missing values (geom_segment).
ggsave("figures/log_odds_by_category.pdf", width = 3, height = 4.85, units = 'in')
## Warning: Removed 1 rows containing missing values (geom_segment).
dog_probs <- classification_data %>%
select(-image_name, -batch, -batch_str, -X.1, -index, -age, -target_label,-session_id, -correct_or_not, -age_numeric) %>%
filter(category == 'dog') %>%
gather(key = category, value = prob) %>%
group_by(category) %>%
mutate(mean_prob = mean(prob)) %>%
ungroup() %>%
mutate(category = fct_reorder(category, mean_prob, .desc=TRUE))
ggplot(dog_probs, aes(x=category, y=prob)) +
geom_boxplot(alpha=.2) +
theme_few() +
theme(axis.text.x = element_text(angle = 90))
classification_data_long <- classification_data %>%
mutate(correct_or_not = as.logical(correct_or_not)) %>%
gather(key = 'class', value = 'prob', contains('prob')) %>%
mutate(class = str_split_fixed(class, '_prob',2)[,1])
confusions_by_age <- classification_data_long %>%
mutate(drawn_category = category) %>%
left_join(animacy_csv) %>%
ungroup() %>%
filter(correct_or_not==0) %>%
filter(age_numeric > 2) %>%
mutate(drawn_category = fct_reorder(drawn_category, animacy_size)) %>%
mutate(class = factor(class, levels = levels(drawn_category))) %>%
mutate(age_group = cut(age_numeric, c(2.9, 5, 7,10.1), labels = c("3-4","5-6","7-10"))) %>%
group_by(age_group, age_numeric, drawn_category, class) %>%
dplyr::summarize(mean_prob = mean(prob))
## Joining, by = "category"
## `summarise()` regrouping output by 'age_group', 'age_numeric', 'drawn_category' (override with `.groups` argument)
levels(confusions_by_age$drawn_category)
## [1] "airplane" "bed" "bike" "boat" "cactus"
## [6] "car" "chair" "couch" "house" "piano"
## [11] "train" "tree" "TV" "apple" "book"
## [16] "bottle" "bowl" "clock" "cup" "hat"
## [21] "ice.cream" "key" "lamp" "mushroom" "phone"
## [26] "scissors" "watch" "bee" "bird" "cat"
## [31] "dog" "face" "fish" "frog" "hand"
## [36] "rabbit" "snail" "spider" "bear" "camel"
## [41] "cow" "elephant" "horse" "octopus" "person"
## [46] "sheep" "tiger" "whale"
ggplot(data = confusions_by_age, aes(x=class, y=drawn_category,fill=mean_prob)) +
geom_tile() +
theme_few(base_size = 10) +
theme(legend.position = 'none', axis.text.x = element_text(angle = 90, vjust = 1,
size = 6, hjust = 1), axis.text.y = element_text(angle = 0,
size = 6, hjust = 1)) +
coord_fixed() +
scale_fill_viridis(option="A", limits=c(0,quantile(confusions_by_age$mean_prob,.995)), name = 'Classifier probability') +
ylab('Drawn as') +
xlab('Confused as')
# facet_wrap(~age_group, nrow=1)
ggsave('figures/classifier_confusions.pdf', width = 4, height = 5, units = 'in')
animacy_size_acc_by_age <- classification_data_long %>%
mutate(drawn_category = category) %>%
left_join(animacy_csv %>% select(animacy, size, category), by = c("drawn_category" = "category")) %>%
ungroup() %>%
rename(drawn_category_animacy = animacy, drawn_category_size = size) %>%
left_join(animacy_csv %>% select(animacy, size, category), by = c("class" = "category")) %>%
rename(class_animacy = animacy, class_size = size) %>%
group_by(image_name, drawn_category, age_numeric) %>%
mutate(max_prob = max(prob), top_class = prob==max_prob) %>%
filter(top_class==TRUE) %>%
mutate(animacy_correct = (drawn_category_animacy == class_animacy)) %>%
mutate(size_correct = (drawn_category_size == class_size))
# anim_cor_by_category_by_age <- animacy_size_acc_by_age %>%
# filter(correct_or_not==0) %>%
# group_by(age_numeric,drawn_category, drawn_category_animacy) %>%
# summarize(avg_animacy_correct = mean(animacy_correct), num_drawings = n()) %>%
# ungroup() %>%
# mutate(drawn_category = fct_reorder(drawn_category, drawn_category_animacy))
#
# ggplot(anim_cor_by_category_by_age, aes(age_numeric,avg_animacy_correct, color=drawn_category_animacy, size=num_drawings)) +
# geom_point(alpha=.5) +
# theme_few(base_size = base_size_chosen) +
# labs(x='Age', y='Proportion animacy correct') +
# theme(legend.position = "none") +
# geom_smooth(span=10, alpha=smooth_alpha) +
# ylim(0,1) +
# geom_hline(yintercept = .43, linetype="dashed", color="grey") +
# facet_wrap(~drawn_category, nrow=6)
baseline_animacy <- mean(animacy_csv$animacy)
baseline_objects <- 1 - mean(animacy_csv$animacy)
anim_cor_by_category_by_age <- animacy_size_acc_by_age %>%
filter(correct_or_not==0) %>%
mutate(drawn_category_animacy = as.factor(drawn_category_animacy)) %>%
mutate(baseline_chance = case_when(drawn_category_animacy == 0 ~ baseline_objects,
drawn_category_animacy == 1 ~ baseline_animacy)) %>%
group_by(age_numeric, drawn_category) %>%
dplyr::summarize(avg_animacy_correct = mean(animacy_correct) - baseline_chance) %>%
distinct(age_numeric, avg_animacy_correct, drawn_category)
## `summarise()` regrouping output by 'age_numeric', 'drawn_category' (override with `.groups` argument)
anim_cor_by_age <- anim_cor_by_category_by_age %>%
group_by(age_numeric) %>%
multi_boot_standard(col = 'avg_animacy_correct')
ggplot(anim_cor_by_age, aes(x=age_numeric,y=mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), alpha=.8) +
geom_point(data=anim_cor_by_category_by_age, aes(x=age_numeric, y=avg_animacy_correct), alpha=.1) +
theme_few(base_size = 10) +
labs(x='Age', y='Prop. animacy correct') +
theme(legend.position = "none") +
geom_smooth(span=10, alpha=smooth_alpha, color='grey') +
scale_color_viridis(option="D", discrete=FALSE) +
geom_hline(yintercept = 0 , linetype="dashed", color="grey")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave('figures/animacy_classification.pdf', width = 3, height = 3, units = 'in')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
objects_only <- animacy_csv %>%
filter(animacy==0)
baseline_big <- mean(objects_only$size)
baseline_small <- 1 - mean(objects_only$size)
size_cor_by_category_by_age <- animacy_size_acc_by_age %>%
filter(correct_or_not==0) %>%
filter(drawn_category_animacy==0) %>% #only objects
mutate(drawn_category_size = as.factor(drawn_category_size)) %>%
mutate(baseline_chance = case_when(drawn_category_size == 0 ~ baseline_small,
drawn_category_size == 1 ~ baseline_big)) %>%
group_by(age_numeric, drawn_category) %>%
dplyr::summarize(avg_size_correct = mean(size_correct) - baseline_chance) %>%
distinct(age_numeric, avg_size_correct, drawn_category)
## `summarise()` regrouping output by 'age_numeric', 'drawn_category' (override with `.groups` argument)
size_cor_by_age <- size_cor_by_category_by_age %>%
group_by(age_numeric) %>%
multi_boot_standard(col = 'avg_size_correct')
## Plot 4: Plot animacy correct by age
ggplot(size_cor_by_age, aes(x=age_numeric,y=mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), alpha=.8) +
geom_point(data=size_cor_by_category_by_age, aes(x=age_numeric, y=avg_size_correct), alpha=.1) +
theme_few(base_size = 10) +
labs(x='Age', y='Prop. object size correct') +
theme(legend.position = "none") +
geom_smooth(span=10, alpha=smooth_alpha, color='grey') +
scale_color_viridis(option="D", discrete=FALSE) +
geom_hline(yintercept = 0 , linetype="dashed", color="grey")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave('figures/size_classification.pdf', width = 3, height = 3, units = 'in')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
accuracy_age_only <- glmer(correct_or_not ~ age_numeric +
(1|session_id) +
(1|category),
data = d)
## Warning in glmer(correct_or_not ~ age_numeric + (1 | session_id) + (1
## | : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly
summary(accuracy_age_only)
## Linear mixed model fit by REML ['lmerMod']
## Formula: correct_or_not ~ age_numeric + (1 | session_id) + (1 | category)
## Data: d
##
## REML criterion at convergence: 24188.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.9427 -0.7792 -0.3593 1.0192 2.3191
##
## Random effects:
## Groups Name Variance Std.Dev.
## session_id (Intercept) 0.009361 0.09675
## category (Intercept) 0.028904 0.17001
## Residual 0.183361 0.42821
## Number of obs: 20185, groups: session_id, 6088; category, 48
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.156734 0.026367 5.944
## age_numeric 0.032029 0.001587 20.176
##
## Correlation of Fixed Effects:
## (Intr)
## age_numeric -0.343
accuracy_with_drawing_freq <- glmer(correct_or_not ~ scale(age_numeric) +
scale(drawing_frequency) + (1|session_id) +
(age_numeric|category),
data = d)
## Warning in glmer(correct_or_not ~ scale(age_numeric) +
## scale(drawing_frequency) + : calling glmer() with family=gaussian (identity
## link) as a shortcut to lmer() is deprecated; please call lmer() directly
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.00202053
## (tol = 0.002, component 1)
accuracy_with_drawing_freq = summary(accuracy_with_drawing_freq)
xtable::xtable(summary(accuracy_with_drawing_freq)$coef, digits=3, caption = "Model coefficients of a GLMM predicting the recognziability of each drawing.")
## % latex table generated in R 3.6.1 by xtable 1.8-4 package
## % Mon Oct 12 14:25:24 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrr}
## \hline
## & Estimate & Std. Error & t value \\
## \hline
## (Intercept) & 0.340 & 0.025 & 13.572 \\
## scale(age\_numeric) & 0.067 & 0.006 & 10.707 \\
## scale(drawing\_frequency) & 0.026 & 0.018 & 1.448 \\
## \hline
## \end{tabular}
## \caption{Model coefficients of a GLMM predicting the recognziability of each drawing.}
## \end{table}
We first examined how classification accuracy varied according to the age of the child who produced each drawing as well as the category that was drawn. As expected, we found that classification accuracy based on these visual features increased steadily with the age of the child producing the drawing (STATS, see Figure XX), validating the expectation that older children’s drawings contain visual features that make them more recognizable. Importantly, we observed this developmental trend for many different categories that varied in the degree to which they are commonly drawn by children (see SI Appendix, Figure XX): for example, some object categories in our dataset are frequently drawn by children (e.g., car, tree, person) and others very infrequently drawn (e.g., cactus, whale, scissors). We formally evaluated this by asking parents of children aged 3-10 years to estimate the frequency with their child draws each category (N=50 parents, ) and directly examining how it affected classification performance in a second generalized linear mixed model (), adding this term as a covariate. We did not observe that classification performance was influenced by drawing frequency (STATS) in fact, many infrequently drawn categories (e.g. piano) had relatively high classification rates, and some frequently drawn categories (e.g. dog), had relatively low classification rates and were more likely to be confused with other similar categories (e.g., other animals). Thus, children increasingly include distinctive visual features of object categories in their drawings across childhood, regardless of whether these are objects that they have significant experience drawing or objects that they may have never drawn before.
correct_subset <- d %>%
filter(correct_or_not==1)
log_odds_corr_only <- lmer(log_odds ~ age_numeric +
scale(drawing_frequency) +
(1|session_id) +
(1|category) + (1|run),
data =correct_subset)
## boundary (singular) fit: see ?isSingular
xtable::xtable(summary(log_odds_corr_only)$coef, digits=3, caption = "Model coefficients of a GLMM predicting the 'distinctiveness' (i.e. log-odds probability of selecting the correct label) assigned to correctly classified drawings")
## % latex table generated in R 3.6.1 by xtable 1.8-4 package
## % Mon Oct 12 14:25:25 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrr}
## \hline
## & Estimate & Std. Error & df & t value & Pr($>$$|$t$|$) \\
## \hline
## (Intercept) & -1.407 & 0.112 & 56.388 & -12.586 & 0.000 \\
## age\_numeric & 0.070 & 0.007 & 3087.796 & 10.657 & 0.000 \\
## scale(drawing\_frequency) & 0.025 & 0.110 & 42.334 & 0.230 & 0.819 \\
## \hline
## \end{tabular}
## \caption{Model coefficients of a GLMM predicting the 'distinctiveness' (i.e. log-odds probability of selecting the correct label) assigned to correctly classified drawings}
## \end{table}
We thus restricted our analysis to drawings that the model was able to correclty classify (33% of the balanced subset of drawings, N=6922
accuracy_all_covariates <- glmer(correct_or_not ~ scale(avg_tracing_rating)*scale(age_numeric) +
scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
accuracy_all_covariates_no_int <- glmer(correct_or_not ~ scale(avg_tracing_rating)+scale(age_numeric) +
scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
accuracy_no_age <- glmer(correct_or_not ~ scale(avg_tracing_rating) + scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
accuracy_no_age_or_tracing <- glmer(correct_or_not ~ scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
accuracy_no_tracing <- glmer(correct_or_not ~ scale(age_numeric) +
scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
###
null = r.squaredGLMM(accuracy_no_age_or_tracing)
## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help
## page.
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
no_age = r.squaredGLMM(accuracy_no_age)
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
no_tracing = r.squaredGLMM(accuracy_no_tracing)
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
all = r.squaredGLMM(accuracy_all_covariates)
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
no_int = r.squaredGLMM(accuracy_all_covariates_no_int) # no_int = no interaction between tracing/age
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
To examine the contributions of age and tracing ability to recognizability, we also fit reduced versions of the full model and examined the marginal \(R^2\) [@nakagawa2013general]. The fixed effects in a null model without tracing or age (which mainly captures drawing effort) accounted for very little variance (marginal \(R^2\) = 0.001). Adding only children’s age to the model increased \(R^2\) (marginal \(R^2\) = 0.025) as did only adding tracing (marginal \(R^2\) = 0.025). Adding both factors without their interaction (marginal \(R^2\) = 0.035) had a similar effect to adding both factors and their interaction (marginal \(R^2\) = 0.035). Attesting to the immense variability between individuals and categories, adding random intercepts for individuals and categories (and many more parameters) accounted for a much larger amount of variance (conditional \(R^2\) for full model = 0.406).